We will investigate protein-protein interactions (PPIs) for a given list of proteins. To do so, we will retreive PPI data from the STRING database and visualize networks in Cytoscape using the R package RCy3. We will further cluster the network to investigate whether subsets of proteins share a particular molecular function.
Cytoscape running, with the clusterMaker app installed.
library(STRINGdb)
library(RCy3)
library(dplyr)
library(purrr)
library(tidyr)
library(readxl)
library(readr)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)## apiVersion cytoscapeVersion
## "v1" "3.10.2"
## [1] "IFI30" "IFITM2" "FGL2" "STON1" "FAM129A" "SERPINA3"
# Create a dataframe with uppercase letters (to match with STRING alias mapping)
data <- data.frame(query.term = toupper(proteins))
# check if file with mapping to STRING aliases exists
# and create it if it doesn't
if (file.exists("data/mapped2STRING.csv")) {
mapped <- read.csv("data/mapped2STRING.csv")
} else {
mapped <- string_db$map(my_data_frame = data,
my_data_frame_id_col_names = "query.term",
takeFirst=TRUE,
removeUnmappedRows=TRUE,
quiet=FALSE)
write.csv(x = mapped, file = "data/mapped2STRING.csv", quote = FALSE, row.names = FALSE)
}
head(mapped)## query.term STRING_id
## 1 IFI30 9606.ENSP00000384886
## 2 IFITM2 9606.ENSP00000484689
## 3 FGL2 9606.ENSP00000248598
## 4 STON1 9606.ENSP00000384615
## 5 FAM129A 9606.ENSP00000356481
## 6 SERPINA3 9606.ENSP00000450540
# Check how many proteins were not mapped
# Since some gene IDs can map to several STRING identifiers, we account for duplicates
length(unique(data$query.term)) - length(unique(mapped$query.term))## [1] 1
# Get interactions for the mapped proteins if the file isn't already there
# check if file with interaction data exists
# and create it if it doesn't
if (file.exists("data/interactions.csv")) {
interactions <- read.csv("data/interactions.csv")
} else {
interactions <- string_db$get_interactions(mapped$STRING_id)
write.csv(x = interactions, file = "data/interactions.csv", quote = FALSE, row.names = FALSE)
}
# View the first few interactions
head(interactions)## from to combined_score
## 1 9606.ENSP00000162749 9606.ENSP00000216117 435
## 2 9606.ENSP00000162749 9606.ENSP00000216117 435
## 3 9606.ENSP00000009530 9606.ENSP00000225831 497
## 4 9606.ENSP00000009530 9606.ENSP00000225831 497
## 5 9606.ENSP00000162749 9606.ENSP00000225831 768
## 6 9606.ENSP00000162749 9606.ENSP00000225831 768
Define nodes for the network
# Get node columns for the Cytoscape network
nodes <- data.frame(id = unique(c(interactions$from, interactions$to)))
# Merge with the mapped protein names to include original protein names as labels
nodes <- merge(nodes, mapped[, c("STRING_id", "query.term")],
by.x = "id", by.y = "STRING_id",
all.x = TRUE,
all.y = TRUE)
head(nodes)## id query.term
## 1 9606.ENSP00000009530 CD74
## 2 9606.ENSP00000162749 TNFRSF1A
## 3 9606.ENSP00000206423 CCDC80
## 4 9606.ENSP00000209929 FMO2
## 5 9606.ENSP00000216117 HMOX1
## 6 9606.ENSP00000225831 CCL2
Define edges for the network
edges <- data.frame(source = interactions$from, target = interactions$to)
# Remove directionality of edges (necessary for various clusterings later)
# Combine and sort the columns, then get unique rows
unique_edges <- unique(t(apply(edges, 1, function(x) sort(x))))
# Convert back to a dataframe
edges <- as.data.frame(unique_edges, stringsAsFactors = FALSE)
# Rename the columns if needed
colnames(edges) <- c("source", "target")
head(edges)## source target
## 1 9606.ENSP00000162749 9606.ENSP00000216117
## 2 9606.ENSP00000009530 9606.ENSP00000225831
## 3 9606.ENSP00000162749 9606.ENSP00000225831
## 4 9606.ENSP00000216117 9606.ENSP00000225831
## 5 9606.ENSP00000225831 9606.ENSP00000245907
## 6 9606.ENSP00000245907 9606.ENSP00000246006
## You are connected to Cytoscape!
# Create a new Cytoscape network from your data
createNetworkFromDataFrames(nodes = nodes,
edges = edges,
# title = mytitle,
collection = "My Collection")## Loading data...
## Applying default style...
## Applying preferred layout...
## networkSUID
## 9707
# Set node labels to the original protein names, and other visual tweaks
setNodeLabelMapping('query.term')## style.name not specified, so updating "default" style.
## NULL
## style.name not specified, so updating "default" style.
## style.name not specified, so updating "default" style.
Now we modify the network layout in the app to reduce overlap of nodes and make it overall more aesthetically pleasing. Parameters will depend on the network
# Make sure network fits in the frame to be saved
fitContent()
#save network image
exportImage(filename = "currentnetwork.png") ## file
## "/Users/inikaprasad/Desktop/Biostats2024/func-omics/currentnetwork.png"
commands specify: restoreEdges: restores edges after clustering, showUI: displays the new network, and undirectedEdges: assumes edges are undirected
Run GLay community clustering
# Run GLay community clustering
RCy3::commandsRun("cluster glay restoreEdges= true showUI = true undirectedEdges = true")## [1] " Clusters: 14" "Average size: 4,429" "Maximum size: 18"
## [4] "Minimum size: 1" "Modularity: 0,589"
# reduce overlaps by making the nodes less inclined to be close to each other
layoutNetwork('force-directed defaultSpringCoefficient=0.000006 defaultSpringLength=1')Visualize network after clustering
# Make sure network fits in the frame to be saved
fitContent()
#save network image
exportImage(filename = "currentnetwork_clustered.png") ## file
## "/Users/inikaprasad/Desktop/Biostats2024/func-omics/currentnetwork_clustered.png"
# Get the table with clustering results
network_table <- getTableColumns()
clusterinfo <- table(network_table$`__glayCluster`)
clusterinfo##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 18 18 14 1 1 2 1 1 1 1 1 1 1 1
# Save the members of clusters with >5 nodes for subsequent analysis
bigclusters <- names(clusterinfo)[(table(network_table$`__glayCluster`) > 5)]
bigclusternames <- paste0("cluster", bigclusters)
# empty list of clusters to cycle through
clusterlist <- list()
for (i in 1:length(bigclusters)){
# create given clustername
given_clustername <- bigclusternames[i]
# make a list of these lists
clusterlist[[i]] <- network_table$query.term[network_table$`__glayCluster` == bigclusters[i]]
}
names(clusterlist) <- bigclusternamesMembers of each cluster
## $cluster1
## [1] "TNFRSF1A" "CD93" "EMP1" "ICAM1" "FCGR2A" "S100A11"
## [7] "VCAM1" "TLR3" "TNFRSF11B" "OLR1" "ANXA2" "FCGR1A"
## [13] "IL13RA1" "IL1R1" "CD44" "MSR1" "ANGPT1" "MYD88"
##
## $cluster2
## [1] "CD74" "FGL2" "RARRES3" "LYZ" "FCER1G" "CCR1"
## [7] "MS4A7" "C1QB" "MS4A4A" "NCF2" "FCGR3A" "CYBB"
## [13] "IFI30" "CD14" "HLA-DPA1" "MS4A6A" "PTAFR" "IFITM2"
##
## $cluster3
## [1] "HMOX1" "CCL2" "C3" "GFAP" "CP" "SCIN"
## [7] "C7" "TREM2" "STEAP3" "C4A" "C4B" "SERPINA1"
## [13] "SERPINA3" "APLNR"
library(clusterProfiler)
library(org.Hs.eg.db)
# Run enrichment analysis on each cluster
GO_BP_enrichments <- lapply(clusterlist, function(cluster) {
enrichGO(gene = cluster,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.01)
})Create dotplots for results
for (x in 1:length(GO_BP_enrichments)){
# pick out object with enrichments
enrichresult <- GO_BP_enrichments[[x]]
# make dotplot object
p <- dotplot(object = enrichresult) +
ggtitle(paste0("ORA BP for ", names(clusterlist)[x]))
print(p)
}# Run enrichment analysis on each cluster
GO_MF_enrichments <- lapply(clusterlist, function(cluster) {
enrichGO(gene = cluster,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "MF",
pAdjustMethod = "BH",
qvalueCutoff = 0.01)
})Create dotplots for results
for (x in 1:length(GO_MF_enrichments)){
# pick out object with enrichments
enrichresult <- GO_MF_enrichments[[x]]
# make dotplot object
p <- dotplot(object = enrichresult) +
ggtitle(paste0("ORA MF for ", names(clusterlist)[x]))
print(p)
}library(clusterProfiler)
library(org.Hs.eg.db)
# Run enrichment analysis on each cluster
GO_CC_enrichments <- lapply(clusterlist, function(cluster) {
enrichGO(gene = cluster,
OrgDb = org.Hs.eg.db,
keyType = "SYMBOL",
ont = "CC",
pAdjustMethod = "BH",
qvalueCutoff = 0.01)
})Create dotplots for results
for (x in 1:length(GO_CC_enrichments)){
# pick out object with enrichments
enrichresult <- GO_CC_enrichments[[x]]
# make dotplot object
p <- dotplot(object = enrichresult) +
ggtitle(paste0("ORA CC for ", names(clusterlist)[x]))
print(p)
}